In this section, we load the vaccination and deaths datasets with pandas and call head() on each one to inspect its first few rows.
import pandas as pd
# Locations of the two input CSV files (relative to the working directory):
# state-level vaccination records and the US deaths time series.
vaccinations_path, deaths_path = (
    'us_state_vaccinations.csv',
    'time_series_covid19_deaths_US.csv',
)
# Read both files into DataFrames in one pass.
vaccinations_data, deaths_data = map(pd.read_csv, (vaccinations_path, deaths_path))
# Preview the first five rows of each frame as the cell's output.
(vaccinations_data.head(), deaths_data.head())
( date location total_vaccinations total_distributed \
0 2021-01-12 Alabama 78134.0 377025.0
1 2021-01-13 Alabama 84040.0 378975.0
2 2021-01-14 Alabama 92300.0 435350.0
3 2021-01-15 Alabama 100567.0 444650.0
4 2021-01-16 Alabama NaN NaN
people_vaccinated people_fully_vaccinated_per_hundred \
0 70861.0 0.15
1 74792.0 0.19
2 80480.0 NaN
3 86956.0 0.28
4 NaN NaN
total_vaccinations_per_hundred people_fully_vaccinated \
0 1.59 7270.0
1 1.71 9245.0
2 1.88 NaN
3 2.05 13488.0
4 NaN NaN
people_vaccinated_per_hundred distributed_per_hundred \
0 1.45 7.69
1 1.53 7.73
2 1.64 8.88
3 1.77 9.07
4 NaN NaN
daily_vaccinations_raw daily_vaccinations daily_vaccinations_per_million \
0 NaN NaN NaN
1 5906.0 5906.0 1205.0
2 8260.0 7083.0 1445.0
3 8267.0 7478.0 1525.0
4 NaN 7498.0 1529.0
share_doses_used total_boosters total_boosters_per_hundred
0 0.207 NaN NaN
1 0.222 NaN NaN
2 0.212 NaN NaN
3 0.226 NaN NaN
4 NaN NaN NaN ,
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region \
0 84001001 US USA 840 1001.0 Autauga Alabama US
1 84001003 US USA 840 1003.0 Baldwin Alabama US
2 84001005 US USA 840 1005.0 Barbour Alabama US
3 84001007 US USA 840 1007.0 Bibb Alabama US
4 84001009 US USA 840 1009.0 Blount Alabama US
Lat Long_ ... 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 \
0 32.539527 -86.644082 ... 230 232 232 232 232 232
1 30.727750 -87.722071 ... 724 726 726 726 726 726
2 31.868263 -85.387129 ... 103 103 103 103 103 103
3 32.996421 -87.125115 ... 109 109 109 109 109 109
4 33.982109 -86.567906 ... 261 261 261 261 261 261
3/6/23 3/7/23 3/8/23 3/9/23
0 232 232 232 232
1 726 726 727 727
2 103 103 103 103
3 109 109 109 109
4 261 261 261 261
[5 rows x 1155 columns])
This section preprocesses the data: it converts the vaccination dates to datetime, reshapes the deaths table from wide format (one column per date) to long format, and sums the county-level deaths by state and date. The head() function is used again to inspect the processed deaths data.
import pandas as pd
# Parse the vaccination dates (ISO 'YYYY-MM-DD' strings in the preview above).
vaccinations_data['date'] = pd.to_datetime(vaccinations_data['date'])

# The deaths table is wide: 12 leading metadata columns followed by one column
# per day, labelled month/day/two-digit-year (e.g. '3/9/23').
date_columns = deaths_data.columns[12:]  # skipping non-date columns

# Reshape to long format: one row per (county row, date) with its death count.
deaths_data_melted = deaths_data.melt(
    id_vars=['Province_State'],
    value_vars=date_columns,
    var_name='date',
    value_name='deaths',
)

# Give the format explicitly: without it pandas cannot infer the layout, emits
# a UserWarning and falls back to slow element-by-element dateutil parsing.
deaths_data_melted['date'] = pd.to_datetime(
    deaths_data_melted['date'], format='%m/%d/%y'
)

# Sum the county-level rows into one total per state and date.
deaths_by_state_date = (
    deaths_data_melted
    .groupby(['Province_State', 'date'], as_index=False)['deaths']
    .sum()
)

# Inspect the processed data for deaths
deaths_by_state_date.head()
C:\Users\varsh\AppData\Local\Temp\ipykernel_25828\3480503501.py:9: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format. deaths_data_melted['date'] = pd.to_datetime(deaths_data_melted['date'])
| Province_State | date | deaths | |
|---|---|---|---|
| 0 | Alabama | 2020-01-22 | 0 |
| 1 | Alabama | 2020-01-23 | 0 |
| 2 | Alabama | 2020-01-24 | 0 |
| 3 | Alabama | 2020-01-25 | 0 |
| 4 | Alabama | 2020-01-26 | 0 |
This section checks the date ranges for both datasets, filters them to the overlapping date range, and merges them on state and date. It also renames some columns for clarity and inspects the merged dataset to ensure proper merging and alignment of data.
# Date span covered by each dataset. These are printed explicitly because a
# bare expression in the middle of a notebook cell is never displayed.
vaccination_date_range = (vaccinations_data['date'].min(), vaccinations_data['date'].max())
death_date_range = (deaths_by_state_date['date'].min(), deaths_by_state_date['date'].max())
print('Vaccination date range:', vaccination_date_range)
print('Deaths date range:', death_date_range)

# Overlap of the two spans: latest start to earliest end. Deriving it from the
# data keeps the window correct if either input file is refreshed.
start_date = max(vaccination_date_range[0], death_date_range[0])
end_date = min(vaccination_date_range[1], death_date_range[1])

# Restrict both datasets to the overlapping window (bounds inclusive).
filtered_vaccinations = vaccinations_data[vaccinations_data['date'].between(start_date, end_date)]
filtered_deaths = deaths_by_state_date[deaths_by_state_date['date'].between(start_date, end_date)]

# NOTE(review): the vaccination file is believed to label New York as
# 'New York State', which would never match the deaths file's 'New York' and
# would silently drop the state from the inner join. Confirm against the
# data; the replacement is a no-op if the label is absent.
location_aliases = {'New York State': 'New York'}
vaccinations_for_merge = filtered_vaccinations.assign(
    location=filtered_vaccinations['location'].replace(location_aliases)
)

# Inner join on (state, date): only rows present in both datasets survive, so
# vaccination locations with no counterpart in the deaths data are dropped.
merged_data = pd.merge(
    vaccinations_for_merge,
    filtered_deaths,
    left_on=['location', 'date'],
    right_on=['Province_State', 'date'],
    how='inner',
)

# Rename for clarity. 'Province_State' is kept alongside 'state' (it carries
# the same values after the join) so existing column references keep working.
merged_data = merged_data.rename(columns={'location': 'state', 'people_vaccinated': 'total_people_vaccinated'})
merged_data.head()
| date | state | total_vaccinations | total_distributed | total_people_vaccinated | people_fully_vaccinated_per_hundred | total_vaccinations_per_hundred | people_fully_vaccinated | people_vaccinated_per_hundred | distributed_per_hundred | daily_vaccinations_raw | daily_vaccinations | daily_vaccinations_per_million | share_doses_used | total_boosters | total_boosters_per_hundred | Province_State | deaths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-12 | Alabama | 78134.0 | 377025.0 | 70861.0 | 0.15 | 1.59 | 7270.0 | 1.45 | 7.69 | NaN | NaN | NaN | 0.207 | NaN | NaN | Alabama | 5573 |
| 1 | 2021-01-13 | Alabama | 84040.0 | 378975.0 | 74792.0 | 0.19 | 1.71 | 9245.0 | 1.53 | 7.73 | 5906.0 | 5906.0 | 1205.0 | 0.222 | NaN | NaN | Alabama | 5760 |
| 2 | 2021-01-14 | Alabama | 92300.0 | 435350.0 | 80480.0 | NaN | 1.88 | NaN | 1.64 | 8.88 | 8260.0 | 7083.0 | 1445.0 | 0.212 | NaN | NaN | Alabama | 5945 |
| 3 | 2021-01-15 | Alabama | 100567.0 | 444650.0 | 86956.0 | 0.28 | 2.05 | 13488.0 | 1.77 | 9.07 | 8267.0 | 7478.0 | 1525.0 | 0.226 | NaN | NaN | Alabama | 6030 |
| 4 | 2021-01-16 | Alabama | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7498.0 | 1529.0 | NaN | NaN | NaN | Alabama | 6119 |
import pandas as pd
import plotly.graph_objects as go
# Interactive figure: one vaccinations line and one deaths line per state,
# with a dropdown that shows exactly one state's pair of lines at a time.
# Reads 'state', 'date', 'total_people_vaccinated' and 'deaths' from merged_data.
fig = go.Figure()

# Alphabetical state list; it fixes both the trace order and the button order.
states = sorted(merged_data['state'].unique())

# Add two traces per state (vaccinations first, then deaths), all hidden.
# The dropdown's visibility masks below rely on this exact ordering.
for state in states:
    state_data = merged_data[merged_data['state'] == state]
    fig.add_trace(go.Scatter(
        x=state_data['date'],
        y=state_data['total_people_vaccinated'],
        mode='lines',
        name=f"{state} Vaccinations",
        visible=False,  # Initially hidden
    ))
    fig.add_trace(go.Scatter(
        x=state_data['date'],
        y=state_data['deaths'],
        mode='lines',
        name=f"{state} Deaths",
        visible=False,  # Initially hidden
    ))

# Show the first state's pair of traces on load. Slicing (rather than indexing
# fig.data[0] / fig.data[1]) keeps this safe when merged_data has no rows.
for trace in fig.data[:2]:
    trace.visible = True

# One dropdown button per state. Each mask has exactly one entry per trace
# (two per state, in trace order); the previous version multiplied the mask
# by 2, making it twice as long as the trace list.
state_buttons = [
    {
        'label': state,
        'method': 'update',
        'args': [
            {'visible': [s == state for s in states for _ in (0, 1)]},
            {'title': f'COVID-19 Vaccinations and Deaths in {state}'},
        ],
    }
    for state in states
]

# Update layout with a dropdown menu for selecting states
fig.update_layout(
    title='COVID-19 Vaccinations and Deaths by State Over Time',
    xaxis_title='Date',
    yaxis_title='Number of Vaccinations/Deaths',
    updatemenus=[{
        'buttons': state_buttons,
        'direction': 'down',
        'showactive': True,
        'pad': {'r': 10, 't': 10},
        'x': 0.1,
        'xanchor': 'left',
        'y': 1.1,
        'yanchor': 'top',
    }],
    hovermode='closest',
)
fig.show()
C:\ProgramData\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning: The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result